raw.dta<-read_csv("../data/FSA_data_competition_2020.csv", na = c("NA", ":", "unclassified", "unknown")) %>% 
         rename("ID" = "ID", 
                "date_added" = "Date Added", 
                "date_published" = "Date of Publishing", 
                "data_source" = "Data Source", 
                "source_type" = "Source Type", 
                "alert_type" = "Alert Type", 
                "raw_text_product" = "Raw Product Phrase", 
                "product_categoty" = "Product Category", 
                "product" = "Commodity / Product", 
                "origin_country" = "Country of Origin", 
                "origin_country_EU" = "Eu/non-EU Country of Origin",
                "notified_country" = "Notified by", 
                "notified_country_EU" = "EU/non-EU Notifying Country",
                "incident_title" = "Incident Title", 
                "hazard_description" = "Hazard Description", # can extract about ecoli fro here
                "hazard_group" = "Hazard Group", 
                "summary" = "Summary", 
                "link" = "Link", 
                "food_feed_fcm" = "Food; Feed or FCM", 
                "manufacturer" = "Manufacturer", 
                "brand" = "Brand", 
                "organisation" = "Organisations", 
                "food_or_not" = "Is A Food Article" )

# basic tidy

data <- raw.dta %>% 
        select(-food_or_not, -incident_title) %>% 
        mutate(food_feed_fcm = ifelse(food_feed_fcm == 'FCM', 'fcm', 
                               ifelse(food_feed_fcm =='Food', 'food', food_feed_fcm))) %>% 
        filter(food_feed_fcm != "fcm")
data %>% arrange(date_published) %>% vis_miss()

# new cols
data <- data %>%
        # tidy up dates using lubridate
        mutate(date_added = dmy(date_added), 
                date_published = dmy(date_published)) %>% 
        mutate(date_added_year = year(date_added),
               date_published_year = year(date_published)) %>% 
        mutate(date_published_month = ifelse(nchar(month(date_published)) == 2, month(date_published), paste0(0, month(date_published))),
               # create year_month
               date_published_year_month = paste0(date_published_year, "-", date_published_month), 
               #create year_quarter
               date_published_quarter = ifelse(date_published_month %in% c("01", "02", "03"), "Q1",
                                       ifelse(date_published_month %in% c("04", "05", "06"), "Q2",
                                       ifelse(date_published_month %in% c("07", "08", "09"), "Q3",
                                       ifelse(date_published_month %in% c("10", "11", "12"), "Q4", NA)))),
               date_published_year_quarter = paste0(date_published_year, "-", date_published_quarter),
               date_published_fulltime = paste0(date_published, " 00:00:00 +00:00")) %>% 
        # create incident ID
        separate(ID, into= c("ID", "ID_incident"), sep= "-", remove=F) %>% 
        mutate(ID_incident = ifelse(is.na(ID_incident), ID, ID_incident))


# 2015-01-15 19:05:39 +00:00


data %>% count(date_added_year)
## # A tibble: 2 x 2
##   date_added_year     n
##             <dbl> <int>
## 1            2019 21876
## 2            2020 10244
data %>% count(date_published_year)
## # A tibble: 6 x 2
##   date_published_year     n
##                 <dbl> <int>
## 1                2016  4306
## 2                2017  6497
## 3                2018  8228
## 4                2019 10149
## 5                2020  2937
## 6                  NA     3
# update alert types into more general categories, aso some of them are "translation-guided" but mean the same thing

data %>% 
  count(alert_type,sort=T)
## # A tibble: 10 x 2
##    alert_type                    n
##    <chr>                     <int>
##  1 recall                    14064
##  2 border rejection           5504
##  3 alert                      3649
##  4 information for attention  2692
##  5 update                     2082
##  6 information for follow-up  1941
##  7 outbreak                   1863
##  8 warning                     182
##  9 information                 109
## 10 lookout                      34
data %>% 
  count(alert_type, data_source, sort=T)
## # A tibble: 55 x 3
##    alert_type                data_source                                       n
##    <chr>                     <chr>                                         <int>
##  1 border rejection          RASFF Portal                                   3654
##  2 alert                     RASFF Portal                                   3649
##  3 recall                    CDPH Recalls (Canada)                          3269
##  4 information for attention RASFF Portal                                   2662
##  5 information for follow-up RASFF Portal                                   1941
##  6 border rejection          Ministry of Health - Border Rejections (Japa…  1850
##  7 update                    CDPH Recalls (Canada)                          1806
##  8 recall                    MAPAQ (Canada)                                 1200
##  9 recall                    Food Poisoning Bulletin (US)                   1199
## 10 recall                    FSA Alerts & Recalls (UK)                      1183
## # … with 45 more rows
data %>% filter(alert_type=="alert") %>% View

data<-data %>% 
  mutate(alert_type_large = ifelse(alert_type == "lookout", "alert", # italy-specific
                      ifelse(alert_type =="warning", "alert", # belgium-specific
                      ifelse(alert_type =="information", "alert", # norway specific
                      ifelse(alert_type =="outbreak", "alert", # USA specific
                      ifelse(alert_type =="information for follow-up", "update", # RAFSS term fro update
                         alert_type))))))
data %>% count(product) %>% arrange(-n)
## # A tibble: 3,490 x 2
##    product             n
##    <chr>           <int>
##  1 <NA>             1973
##  2 chicken          1951
##  3 bakery product   1935
##  4 beef             1681
##  5 meat product      995
##  6 pepper            816
##  7 food supplement   720
##  8 cheese            660
##  9 pork              644
## 10 sesame            540
## # … with 3,480 more rows
data %>% count(alert_type) %>% arrange(-n)
## # A tibble: 10 x 2
##    alert_type                    n
##    <chr>                     <int>
##  1 recall                    14064
##  2 border rejection           5504
##  3 alert                      3649
##  4 information for attention  2692
##  5 update                     2082
##  6 information for follow-up  1941
##  7 outbreak                   1863
##  8 warning                     182
##  9 information                 109
## 10 lookout                      34
data %>% count(data_source) %>% arrange(-n)
## # A tibble: 43 x 2
##    data_source                                        n
##    <chr>                                          <int>
##  1 RASFF Portal                                   11906
##  2 CDPH Recalls (Canada)                           5075
##  3 Food Poisoning Bulletin (US)                    2133
##  4 Ministry of Health - Border Rejections (Japan)  1850
##  5 FoodSafetyNews.com                              1495
##  6 FSA Alerts & Recalls (UK)                       1427
##  7 MAPAQ (Canada)                                  1230
##  8 FDA Recalls (USA)                                904
##  9 Product Recalls Website: Oulah (France)          896
## 10 AFSCA Recalls (Belgium)                          614
## # … with 33 more rows
data %>% count(hazard_group) %>% arrange(-n)
## # A tibble: 35 x 2
##    hazard_group                       n
##    <chr>                          <int>
##  1 microbial contaminants (other)  5831
##  2 pathogenic micro-organisms      5588
##  3 allergens                       5221
##  4 <NA>                            3171
##  5 foreign bodies                  1896
##  6 composition                     1536
##  7 poor or insufficient controls   1192
##  8 pesticide residues              1149
##  9 heavy metals                     869
## 10 Fraud                            783
## # … with 25 more rows
data %>% count(origin_country) %>% arrange(-n)
## # A tibble: 151 x 2
##    origin_country     n
##    <chr>          <int>
##  1 Canada          6508
##  2 USA             4763
##  3 United Kingdom  1966
##  4 France          1823
##  5 Italy           1130
##  6 China           1040
##  7 Belgium         1018
##  8 Poland           885
##  9 Netherlands      840
## 10 Turkey           826
## # … with 141 more rows
# Hazard categoty overview by Year
dat1<-data %>% 
  filter(!is.na(hazard_group),
         date_published_year_month != "NA-NA") %>% 
  group_by(hazard_group) %>%
  filter(n()>300) %>% 
  ungroup %>% 
  select(hazard_group, date_published_year )  %>% 
  group_by(hazard_group, date_published_year) %>% 
  mutate(count = n())%>%
  distinct() 

pc <-ggplot(data = dat1, aes(x = date_published_year  , y = count, group = hazard_group)) +
  geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
  geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
  #scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)]  )+
  theme(legend.position = "right", ncol=1) +
  #scale_colour_manual(values=c(pal))+
  theme_minimal_hgrid(10, rel_small = 1) +
  labs(x = "year",  colour="hazard group",
       y = "counts",
       title = "")+
    guides(alpha = FALSE) 
pc

# Hazard categoty overview by Month (all records)

dat2<-data %>% 
  filter(!is.na(hazard_group),
         date_published_year_month != "NA-NA") %>% 
  group_by(hazard_group) %>%
  filter(n()>500) %>% 
  ungroup %>% 
  select(hazard_group, date_published_year_month )  %>% 
  group_by(hazard_group, date_published_year_month) %>% 
  mutate(count = n())%>%
  distinct() 

pc <-ggplot(data = dat2, aes(x = date_published_year_month  , y = count, group = hazard_group)) +
  geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
  geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
  #scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)]  )+
  theme(legend.position = "right", ncol=1) +
  #scale_colour_manual(values=c(pal))+
  theme_minimal_hgrid(9, rel_small = 1) +
  labs(x = "year",  colour="hazard group",
       y = "counts",
       title = "")+
    guides(alpha = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
pc

# Hazard categoty overview by Month (report multiple issue related to one event as one event)

dat3<-data %>% 
  filter(!is.na(hazard_group),
         date_published_year_month != "NA-NA") %>% 
  group_by(hazard_group) %>%
  filter(n()>500) %>% 
  ungroup %>% 
  select(hazard_group, date_published_year_month, ID_incident)  %>% distinct() %>% 
  group_by(hazard_group, date_published_year_month) %>% 
  mutate(count = n())%>%
  distinct() 

pc3 <-ggplot(data = dat3, aes(x = date_published_year_month  , y = count, group = hazard_group)) +
  geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
  geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
  #scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)]  )+
  theme(legend.position = "right", ncol=1) +
  #scale_colour_manual(values=c(pal))+
  theme_minimal_hgrid(9, rel_small = 1) +
  labs(x = "year",  colour="hazard group",
       y = "counts",
       title = "")+
    guides(alpha = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

pc3

# Main 3 categories, with smooth lines

dat4<-data %>% 
  filter(!is.na(hazard_group),
         date_published_year_month != "NA-NA") %>% 
  group_by(hazard_group) %>%
  #filter(n()>500) %>% 
  ungroup %>% 
  select(hazard_group, date_published_year_month, ID_incident)  %>% distinct() %>% 
  group_by(hazard_group, date_published_year_month) %>% 
  filter(hazard_group %in% c("microbial contaminants (other)", "pathogenic micro-organisms", "allergens")) %>% 
  mutate(count = n())%>%
  distinct() 

pc4 <-ggplot(data = dat4, aes(x = date_published_year_month  , y = count, group = hazard_group)) +
  #geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
  geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
  geom_smooth(aes(x = date_published_year_month  , y = count, color = hazard_group))+
  #scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)]  )+
  theme(legend.position = "right", ncol=1) +
  #scale_colour_manual(values=c(pal))+
  theme_minimal_hgrid(9, rel_small = 1) +
  labs(x = "year",  colour="hazard group",
       y = "counts",
       title = "")+
    guides(alpha = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
pc4

# + other categories smooth line

dat5<-data %>% 
  filter(!is.na(hazard_group),
         date_published_year_month != "NA-NA") %>% 
  group_by(hazard_group) %>%
  filter(n()>500) %>% 
  ungroup %>% 
  select(hazard_group, date_published_year_month, ID_incident)  %>% distinct() %>% 
  group_by(hazard_group, date_published_year_month) %>% 
  #filter(hazard_group %in% c("microbial contaminants (other)", "pathogenic micro-organisms", "allergens")) %>% 
  mutate(count = n())%>%
  distinct() 

pc5 <-ggplot(data = dat5, aes(x = date_published_year_month  , y = count, group = hazard_group)) +
  #geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
  geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
  geom_smooth(aes(x = date_published_year_month  , y = count, color = hazard_group))+
  #scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)]  )+
  theme(legend.position = "right", ncol=1) +
  #scale_colour_manual(values=c(pal))+
  theme_minimal_hgrid(9, rel_small = 1) +
  labs(x = "year",  colour="hazard group",
       y = "counts",
       title = "")+
    guides(alpha = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
pc5

# all foreign bodies

dat7<-data %>% 
  filter(!is.na(hazard_group),
         date_published_year_month != "NA-NA") %>% 
  group_by(hazard_group) %>%
  filter(n()>500) %>% 
  ungroup %>% 
  select(hazard_group, date_published_year_month, ID_incident) %>% distinct() %>% 
  group_by(hazard_group, date_published_year_month) %>% 
  filter(hazard_group == "foreign bodies") %>% 
  mutate(count = n())%>%
  distinct() 

pc7 <-ggplot(data = dat7, aes(x = date_published_year_month  , y = count, group = hazard_group)) +
  #geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
  geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
  geom_smooth(aes(x = date_published_year_month  , y = count, color = hazard_group))+
  #scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)]  )+
  theme(legend.position = "right", ncol=1) +
  #scale_colour_manual(values=c(pal))+
  theme_minimal_hgrid(9, rel_small = 1) +
  labs(x = "year",  colour="hazard group",
       y = "counts",
       title = "")+
    guides(alpha = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

pc7

Add coordinates data

add_coordinates <- function(input){
  
  ## get coordinates
  # get world map
  wmap <- getMap(resolution="high")
  # get centroids
  centroids <- gCentroid(wmap, byid=TRUE)
  # get a data.frame with centroids
  geo_df <- as.data.frame(centroids)
  colnames(geo_df) <- c("long", "lat")
  geo_df <- geo_df %>% tibble::rownames_to_column("country")

  
  # update names in data
  input<- input %>% 
    mutate( origin_country = case_when(origin_country == "USA" ~ "United States of America",
                                            origin_country =="Hong Kong" ~ "Hong Kong S.A.R.",
                                            origin_country =="Serbia" ~ "Republic of Serbia", 
                                            origin_country =="Bosnia Herzegovina" ~ "Bosnia and Herzegovina",
                                            origin_country =="Tanzania" ~ "United Republic of Tanzania",
                                            TRUE ~ origin_country)) %>% 
     mutate( notified_country = case_when(notified_country == "USA" ~ "United States of America",
                                            notified_country =="Hong Kong" ~ "Hong Kong S.A.R.",
                                            notified_country =="Serbia" ~ "Republic of Serbia", 
                                            notified_country =="Bosnia Herzegovina" ~ "Bosnia and Herzegovina",
                                            notified_country =="Tanzania" ~ "United Republic of Tanzania",
                                            TRUE ~ notified_country))  %>% 
    filter(!origin_country %in% c("Palestinian Territories", "INFOSAN" , "Commission Services", NA) | !notified_country %in% c("Palestinian Territories", "INFOSAN" , "Commission Services", NA))
  
  
  # join with coords data
  output <- input %>% 
    left_join(., geo_df, by = c("origin_country" = "country")) %>% 
    rename("lat.origin" = "lat",
           "long.origin" = "long") %>%
    left_join(., geo_df, by = c("notified_country" = "country")) %>% 
    rename("lat.notified" = "lat",
           "long.notified" = "long") %>% drop_na()
  
  return(output)
}



test <- add_coordinates(data)

save all with coords

data %>% 
  select(-link, -brand, -manufacturer,  -organisation, -date_added, -date_added_year) %>% 
  add_coordinates() %>% 
  write_csv("../data/tidy/food_hazards_data_all.csv")

Add random coordinates

library(sp)
library(maptools)
data(wrld_simpl) # load data 

data_all<-read_csv("../data/tidy/food_hazards_data_all.csv")


# update names in data to match source of coordiantes
input<- data_all %>% 
    mutate( origin_country = case_when(origin_country ==  "United States of America" ~ "United States",
                                            origin_country == "Hong Kong S.A.R." ~ "Hong Kong",
                                            origin_country ==  "Republic of Serbia" ~ "Serbia", 
                                            origin_country =="Vietnam" ~ "Viet Nam",
                                            origin_country =="South Korea" ~ "Korea, Republic of",
                                            origin_country =="Iran" ~ "Iran (Islamic Republic of)",
                                            origin_country =="Syria" ~ "Syrian Arab Republic",
                                            origin_country =="Laos" ~ "Lao People's Democratic Republic",
                                            origin_country =="Moldova" ~ "Republic of Moldova",
                                            origin_country =="Brunei" ~ "Brunei Darussalam",
                                            origin_country =="Kosovo" ~ "Serbia",
                                            origin_country =="Myanmar" ~ "Burma",
                                            TRUE ~ origin_country)) %>% 
     mutate( notified_country = case_when(notified_country ==  "United States of America" ~ "United States",
                                            notified_country == "Hong Kong S.A.R." ~ "Hong Kong",
                                            notified_country ==  "Republic of Serbia" ~ "Serbia", 
                                            notified_country =="Vietnam" ~ "Viet Nam",
                                            notified_country =="South Korea" ~ "Korea, Republic of",
                                            notified_country =="Iran" ~ "Iran (Islamic Republic of)",
                                            notified_country =="Syria" ~ "Syrian Arab Republic",
                                            notified_country =="Laos" ~ "Lao People's Democratic Republic",
                                            notified_country =="Moldova" ~ "Republic of Moldova",
                                            notified_country =="Brunei" ~ "Brunei Darussalam",
                                            notified_country =="Kosovo" ~ "Serbia",
                                            notified_country =="Myanmar" ~ "Burma",
                                            TRUE ~ notified_country))
                                            


get_random_cooor_single <- function(country_name){
  
  if (!country_name %in% wrld_simpl$NAME){
    stop(paste0("Polygon not available for ", country_name))
  }
  
  # get country polygon
  sw <- slot(wrld_simpl[wrld_simpl$NAME == country_name,], "polygons")[[1]]
  # sample a random coord (n) within that polygon
  rpoints <- sp::spsample(sw, n = 1, type = "random") %>% as.data.frame() # long-lat
  # convert to vector
  rpoints_vec <- paste(as.numeric(rpoints[1,]), collapse = ",")

  return(rpoints_vec)
}
 

get_random_cooor <- function(country_name, n){
  
  if (!country_name %in% wrld_simpl$NAME){
    stop(paste0("Polygon not available for ", country_name))
  }
  
  # get country polygon
  sw <- slot(wrld_simpl[wrld_simpl$NAME == country_name,], "polygons")[[1]]
  # sample a random coord (n) within that polygon
  rpoints <- sp::spsample(sw, n = n, type = "random") %>% as.data.frame() # long-lat
  colnames(rpoints) <- c("long", "lat")
  
  return(rpoints)
}

## Create random coords for Origin countries

subset_origin <-input %>% select(ID, origin_country)
origin_coords <- data.frame()

for (country in unique(subset_origin$origin_country) ){
  # subset to country
  country_subset<-subset_origin %>% filter(origin_country == country)
  # generate rand coords for n cases
  tmp<-get_random_cooor(country, n = dim(country_subset)[1])
  # addign them to case IDs (order does not matter)
  tmp2<-cbind(country_subset, tmp)
  
  # merge into df that will hold all coords for origin data points
  origin_coords <- rbind(origin_coords, tmp2)
}
colnames(origin_coords)[3:4] <- c('long.origin.rand', 'lat.origin.rand')

## Create random coords for Notified countries

subset_notified <-input %>% select(ID, notified_country)
notified_coords <- data.frame()

for (country in unique(subset_notified$notified_country) ){
  # subset to country
  country_subset<-subset_notified %>% filter(notified_country == country)
  # generate rand coords for n cases
  tmp<-get_random_cooor(country, n = dim(country_subset)[1])
  # addign them to case IDs (order does not matter)
  tmp2<-cbind(country_subset, tmp)
  
  # merge into df that will hold all coords for notified data points
  notified_coords <- rbind(notified_coords, tmp2)
}
colnames(notified_coords)[3:4] <- c('long.notified.rand', 'lat.notified.rand')


## merge new rand coords by case ID inro one df
coords_rand <- full_join(origin_coords, notified_coords, by="ID") %>% 
               select(-origin_country, -notified_country)


## add new random coords to the main dataset 
data_all_upd <- full_join(data_all, coords_rand, by ="ID")

write_csv(data_all_upd, "../data/tidy/food_hazards_data_all.csv")